package com.trytech.mongoocrawler.client.parser;

import com.trytech.mongoocrawler.client.entity.LianjiaItem;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * Parses Lianjia listing pages and extracts the housing information of each listed property.
 */
public class LianjiaHtmlParser extends HtmlParser {
    /**
     * Extracts one {@link LianjiaItem} per well-formed child of the first element
     * carrying the {@code sellListContent} class.
     *
     * <p>Children that lack any of the expected parts (title, houseInfo with a link and
     * at least three {@code |}-separated fields, totalPrice, unitPrice) or whose prices
     * are not numeric are skipped, so a single unexpected entry does not abort the page.
     *
     * @param html raw HTML of a listing page; may be {@code null}
     * @return the parsed items in document order; empty (never {@code null}) when
     *         {@code html} is {@code null} or contains no {@code sellListContent} element
     */
    public static List<LianjiaItem> parse(String html){
        List<LianjiaItem> itemList = new LinkedList<LianjiaItem>();
        if (html == null) {
            return itemList;
        }

        Document doc = Jsoup.parse(html);
        doc.charset(Charset.forName("UTF-8"));
        Element body = doc.body();
        if (body == null) {
            return itemList;
        }

        // first() yields null when no element matches, e.g. on an error or empty-result page.
        Element listEle = body.getElementsByClass("sellListContent").first();
        if (listEle == null) {
            return itemList;
        }

        for (Element ele : listEle.children()) {
            LianjiaItem item = parseListingElement(ele);
            if (item != null) {
                itemList.add(item);
            }
        }
        return itemList;
    }

    /**
     * Builds a {@link LianjiaItem} from a single child of the listing container.
     *
     * @param ele one child element of the {@code sellListContent} container
     * @return the populated item, or {@code null} when the element does not have the
     *         expected structure or its prices cannot be parsed
     */
    private static LianjiaItem parseListingElement(Element ele) {
        Element titleEle = firstChildOfClass(ele, "title");
        Element addressDiv = ele.getElementsByClass("houseInfo").first();
        Element priceEle = firstChildOfClass(ele, "totalPrice");
        Element unitPriceEle = firstChildOfClass(ele, "unitPrice");
        if (titleEle == null || addressDiv == null || priceEle == null || unitPriceEle == null) {
            return null;
        }

        Element addressEle = addressDiv.getElementsByTag("a").first();
        if (addressEle == null) {
            return null;
        }

        // The houseInfo text is a '|'-separated list; fields 1 and 2 are used as
        // type and floor space, so at least three fields must be present.
        String[] subAddress = addressDiv.text().split("\\|");
        if (subAddress.length < 3) {
            return null;
        }

        float price;
        int unitPrice;
        try {
            price = Float.parseFloat(priceEle.ownText());
            // Keep only the ASCII digits of the unit-price text. The previous pattern
            // "[^x00-xff]*" lacked backslashes and therefore kept letters as well,
            // which made Integer.parseInt fail on any text containing them.
            unitPrice = Integer.parseInt(unitPriceEle.ownText().replaceAll("[^0-9]", ""));
        } catch (NumberFormatException ignored) {
            // Non-numeric or empty price text: treat the entry as malformed and skip it.
            return null;
        }

        LianjiaItem item = new LianjiaItem();
        item.setTitle(titleEle.ownText());
        item.setLocation(addressEle.text());
        item.setType(subAddress[1]);
        item.setFloorSpace(subAddress[2]);
        item.setPrice(price);
        item.setUnitPrice(unitPrice);
        return item;
    }

    /**
     * Returns the first child of the first element with the given class found in
     * {@code root}, or {@code null} when there is no such element or it has no children.
     */
    private static Element firstChildOfClass(Element root, String className) {
        Element holder = root.getElementsByClass(className).first();
        if (holder == null) {
            return null;
        }
        return holder.children().first();
    }
}
